import os
import numpy as np
import pandas as pd
import requests
import json
import re
import seaborn as sns
from IPython.display import IFrame
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from wordcloud import STOPWORDS
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
%matplotlib inline
The dataset that I will be wrangling (and analyzing and visualizing) is the tweet archive of Twitter user @dog_rates, also known as WeRateDogs. WeRateDogs is a Twitter account that rates people's dogs with a humorous comment about the dog. These ratings almost always have a denominator of 10. The numerators, though? Almost always greater than 10. 11/10, 12/10, 13/10, etc. Why? Because "they're good dogs Brent." WeRateDogs has over 4 million followers and has received international media coverage. This archive contains basic tweet data (tweet ID, timestamp, text, etc.) for all 5000+ of their tweets as they stood on August 1, 2017. Datasets used:
- twitter-archive-enhanced.csv ... downloaded manually
- image-predictions.tsv ... downloaded programmatically
- tweet_json.txt ... downloaded manually, because my application for a Twitter Developer account was unsuccessful

GATHERING DATA
# Download the image-prediction file programmatically.
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the TSV.
response.raise_for_status()
folder_name = 'image_predict'
# Create the target directory; exist_ok makes this a no-op when it already exists.
os.makedirs(folder_name, exist_ok=True)
# Write the downloaded bytes to <folder_name>/<last path segment of the URL>.
with open(os.path.join(folder_name, url.split("/")[-1]), mode="wb") as file:
    file.write(response.content)
Reading imported and downloaded files into pandas dataframe
# Load the manually downloaded tweet archive.
archive_csv = 'twitter-archive-enhanced.csv'
twitter_archive = pd.read_csv(archive_csv)
# Load the neural-network image predictions (tab-separated file).
predictions_tsv = 'image_predict/image-predictions.tsv'
image_predict = pd.read_csv(predictions_tsv, sep='\t')
# Read the tweet JSON file: one JSON-encoded tweet per line.
tweets_list = []
with open('tweets/tweet_json.txt', encoding='UTF-8') as json_file:
    for line in json_file:
        # Skip blank lines (e.g. a trailing newline) so json.loads does not fail.
        if not line.strip():
            continue
        tweet = json.loads(line)
        # Keep only the three fields used later in the notebook.
        tweets_list.append({'tweet_id': tweet['id'],
                            'retweet_count': tweet['retweet_count'],
                            'favorite_count': tweet['favorite_count']})
# Convert the records collected in 'tweets_list' to a DataFrame
df_tweet = pd.DataFrame(tweets_list,
                        columns=['tweet_id',
                                 'retweet_count',
                                 'favorite_count'])
ASSESSING DATA
VISUAL ASSESSMENT
# Display the archive DataFrame for visual inspection.
twitter_archive
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892420643555336193 | NaN | NaN | 2017-08-01 16:23:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Phineas. He's a mystical boy. Only eve... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892420643... | 13 | 10 | Phineas | None | None | None | None |
| 1 | 892177421306343426 | NaN | NaN | 2017-08-01 00:17:27 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Tilly. She's just checking pup on you.... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892177421... | 13 | 10 | Tilly | None | None | None | None |
| 2 | 891815181378084864 | NaN | NaN | 2017-07-31 00:18:03 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Archie. He is a rare Norwegian Pouncin... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891815181... | 12 | 10 | Archie | None | None | None | None |
| 3 | 891689557279858688 | NaN | NaN | 2017-07-30 15:58:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Darla. She commenced a snooze mid meal... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891689557... | 13 | 10 | Darla | None | None | None | None |
| 4 | 891327558926688256 | NaN | NaN | 2017-07-29 16:00:24 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Franklin. He would like you to stop ca... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891327558... | 12 | 10 | Franklin | None | None | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2351 | 666049248165822465 | NaN | NaN | 2015-11-16 00:24:50 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a 1949 1st generation vulpix. Enj... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666049248... | 5 | 10 | None | None | None | None | None |
| 2352 | 666044226329800704 | NaN | NaN | 2015-11-16 00:04:52 +0000 | <a href="http://twitter.com/download/iphone" r... | This is a purebred Piers Morgan. Loves to Netf... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666044226... | 6 | 10 | a | None | None | None | None |
| 2353 | 666033412701032449 | NaN | NaN | 2015-11-15 23:21:54 +0000 | <a href="http://twitter.com/download/iphone" r... | Here is a very happy pup. Big fan of well-main... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666033412... | 9 | 10 | a | None | None | None | None |
| 2354 | 666029285002620928 | NaN | NaN | 2015-11-15 23:05:30 +0000 | <a href="http://twitter.com/download/iphone" r... | This is a western brown Mitsubishi terrier. Up... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666029285... | 7 | 10 | a | None | None | None | None |
| 2355 | 666020888022790149 | NaN | NaN | 2015-11-15 22:32:08 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a Japanese Irish Setter. Lost eye... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666020888... | 8 | 10 | None | None | None | None | None |
2356 rows × 17 columns
# Display the image-prediction DataFrame for visual inspection.
image_predict
| tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | Welsh_springer_spaniel | 0.465074 | True | collie | 0.156665 | True | Shetland_sheepdog | 0.061428 | True |
| 1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | redbone | 0.506826 | True | miniature_pinscher | 0.074192 | True | Rhodesian_ridgeback | 0.072010 | True |
| 2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | German_shepherd | 0.596461 | True | malinois | 0.138584 | True | bloodhound | 0.116197 | True |
| 3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | Rhodesian_ridgeback | 0.408143 | True | redbone | 0.360687 | True | miniature_pinscher | 0.222752 | True |
| 4 | 666049248165822465 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | miniature_pinscher | 0.560311 | True | Rottweiler | 0.243682 | True | Doberman | 0.154629 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2070 | 891327558926688256 | https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg | 2 | basset | 0.555712 | True | English_springer | 0.225770 | True | German_short-haired_pointer | 0.175219 | True |
| 2071 | 891689557279858688 | https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg | 1 | paper_towel | 0.170278 | False | Labrador_retriever | 0.168086 | True | spatula | 0.040836 | False |
| 2072 | 891815181378084864 | https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg | 1 | Chihuahua | 0.716012 | True | malamute | 0.078253 | True | kelpie | 0.031379 | True |
| 2073 | 892177421306343426 | https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg | 1 | Chihuahua | 0.323581 | True | Pekinese | 0.090647 | True | papillon | 0.068957 | True |
| 2074 | 892420643555336193 | https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg | 1 | orange | 0.097049 | False | bagel | 0.085851 | False | banana | 0.076110 | False |
2075 rows × 12 columns
# Display the retweet/favorite counts DataFrame for visual inspection.
df_tweet
| tweet_id | retweet_count | favorite_count | |
|---|---|---|---|
| 0 | 892420643555336193 | 8853 | 39467 |
| 1 | 892177421306343426 | 6514 | 33819 |
| 2 | 891815181378084864 | 4328 | 25461 |
| 3 | 891689557279858688 | 8964 | 42908 |
| 4 | 891327558926688256 | 9774 | 41048 |
| ... | ... | ... | ... |
| 2349 | 666049248165822465 | 41 | 111 |
| 2350 | 666044226329800704 | 147 | 311 |
| 2351 | 666033412701032449 | 47 | 128 |
| 2352 | 666029285002620928 | 48 | 132 |
| 2353 | 666020888022790149 | 532 | 2535 |
2354 rows × 3 columns
PROGRAMMATIC ASSESSMENT
# Summary statistics of the archive's numeric columns.
twitter_archive.describe()
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | retweeted_status_id | retweeted_status_user_id | rating_numerator | rating_denominator | |
|---|---|---|---|---|---|---|---|
| count | 2.356000e+03 | 7.800000e+01 | 7.800000e+01 | 1.810000e+02 | 1.810000e+02 | 2356.000000 | 2356.000000 |
| mean | 7.427716e+17 | 7.455079e+17 | 2.014171e+16 | 7.720400e+17 | 1.241698e+16 | 13.126486 | 10.455433 |
| std | 6.856705e+16 | 7.582492e+16 | 1.252797e+17 | 6.236928e+16 | 9.599254e+16 | 45.876648 | 6.745237 |
| min | 6.660209e+17 | 6.658147e+17 | 1.185634e+07 | 6.661041e+17 | 7.832140e+05 | 0.000000 | 0.000000 |
| 25% | 6.783989e+17 | 6.757419e+17 | 3.086374e+08 | 7.186315e+17 | 4.196984e+09 | 10.000000 | 10.000000 |
| 50% | 7.196279e+17 | 7.038708e+17 | 4.196984e+09 | 7.804657e+17 | 4.196984e+09 | 11.000000 | 10.000000 |
| 75% | 7.993373e+17 | 8.257804e+17 | 4.196984e+09 | 8.203146e+17 | 4.196984e+09 | 12.000000 | 10.000000 |
| max | 8.924206e+17 | 8.862664e+17 | 8.405479e+17 | 8.874740e+17 | 7.874618e+17 | 1776.000000 | 170.000000 |
# Column dtypes and non-null counts of the archive.
twitter_archive.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2356 entries, 0 to 2355 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2356 non-null int64 1 in_reply_to_status_id 78 non-null float64 2 in_reply_to_user_id 78 non-null float64 3 timestamp 2356 non-null object 4 source 2356 non-null object 5 text 2356 non-null object 6 retweeted_status_id 181 non-null float64 7 retweeted_status_user_id 181 non-null float64 8 retweeted_status_timestamp 181 non-null object 9 expanded_urls 2297 non-null object 10 rating_numerator 2356 non-null int64 11 rating_denominator 2356 non-null int64 12 name 2356 non-null object 13 doggo 2356 non-null object 14 floofer 2356 non-null object 15 pupper 2356 non-null object 16 puppo 2356 non-null object dtypes: float64(4), int64(3), object(10) memory usage: 313.0+ KB
# Frequency of each distinct value in the 'doggo' column.
twitter_archive.doggo.value_counts()
None 2259 doggo 97 Name: doggo, dtype: int64
# Rows whose rating numerator is greater than 100.
twitter_archive.query('rating_numerator > 100')
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 188 | 855862651834028034 | 8.558616e+17 | 1.943518e+08 | 2017-04-22 19:15:32 +0000 | <a href="http://twitter.com/download/iphone" r... | @dhmontgomery We also gave snoop dogg a 420/10... | NaN | NaN | NaN | NaN | 420 | 10 | None | None | None | None | None |
| 189 | 855860136149123072 | 8.558585e+17 | 1.361572e+07 | 2017-04-22 19:05:32 +0000 | <a href="http://twitter.com/download/iphone" r... | @s8n You tried very hard to portray this good ... | NaN | NaN | NaN | NaN | 666 | 10 | None | None | None | None | None |
| 290 | 838150277551247360 | 8.381455e+17 | 2.195506e+07 | 2017-03-04 22:12:52 +0000 | <a href="http://twitter.com/download/iphone" r... | @markhoppus 182/10 | NaN | NaN | NaN | NaN | 182 | 10 | None | None | None | None | None |
| 313 | 835246439529840640 | 8.352460e+17 | 2.625958e+07 | 2017-02-24 21:54:03 +0000 | <a href="http://twitter.com/download/iphone" r... | @jonnysun @Lin_Manuel ok jomny I know you're e... | NaN | NaN | NaN | NaN | 960 | 0 | None | None | None | None | None |
| 902 | 758467244762497024 | NaN | NaN | 2016-07-28 01:00:57 +0000 | <a href="http://twitter.com/download/iphone" r... | Why does this never happen at my front door...... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/758467244... | 165 | 150 | None | None | None | None | None |
| 979 | 749981277374128128 | NaN | NaN | 2016-07-04 15:00:45 +0000 | <a href="https://about.twitter.com/products/tw... | This is Atticus. He's quite simply America af.... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/749981277... | 1776 | 10 | Atticus | None | None | None | None |
| 1120 | 731156023742988288 | NaN | NaN | 2016-05-13 16:15:54 +0000 | <a href="http://twitter.com/download/iphone" r... | Say hello to this unbelievably well behaved sq... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/731156023... | 204 | 170 | this | None | None | None | None |
| 1634 | 684225744407494656 | 6.842229e+17 | 4.196984e+09 | 2016-01-05 04:11:44 +0000 | <a href="http://twitter.com/download/iphone" r... | Two sneaky puppers were not initially seen, mo... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/684225744... | 143 | 130 | None | None | None | None | None |
| 1635 | 684222868335505415 | NaN | NaN | 2016-01-05 04:00:18 +0000 | <a href="http://twitter.com/download/iphone" r... | Someone help the girl is being mugged. Several... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/684222868... | 121 | 110 | None | None | None | None | None |
| 1779 | 677716515794329600 | NaN | NaN | 2015-12-18 05:06:23 +0000 | <a href="http://twitter.com/download/iphone" r... | IT'S PUPPERGEDDON. Total of 144/120 ...I think... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/677716515... | 144 | 120 | None | None | None | None | None |
| 2074 | 670842764863651840 | NaN | NaN | 2015-11-29 05:52:33 +0000 | <a href="http://twitter.com/download/iphone" r... | After so many requests... here you go.\n\nGood... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/670842764... | 420 | 10 | None | None | None | None | None |
# Five randomly sampled archive rows.
twitter_archive.sample(5)
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 939 | 753039830821511168 | NaN | NaN | 2016-07-13 01:34:21 +0000 | <a href="http://vine.co" rel="nofollow">Vine -... | So this just changed my life. 13/10 please enj... | NaN | NaN | NaN | https://vine.co/v/5W2Dg3XPX7a | 13 | 10 | None | None | None | None | None |
| 184 | 856526610513747968 | 8.558181e+17 | 4.196984e+09 | 2017-04-24 15:13:52 +0000 | <a href="http://twitter.com/download/iphone" r... | THIS IS CHARLIE, MARK. HE DID JUST WANT TO SAY... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/856526610... | 14 | 10 | None | None | None | None | None |
| 1617 | 685169283572338688 | NaN | NaN | 2016-01-07 18:41:01 +0000 | <a href="http://twitter.com/download/iphone" r... | Meet Gerbald. He just found out he's adopted. ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/685169283... | 11 | 10 | Gerbald | None | None | pupper | None |
| 633 | 793845145112371200 | NaN | NaN | 2016-11-02 16:00:06 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Clark. He was just caught wearing pant... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/793845145... | 13 | 10 | Clark | None | None | None | None |
| 535 | 807059379405148160 | NaN | NaN | 2016-12-09 03:08:45 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Cali. She arrived preas... | 7.829691e+17 | 4.196984e+09 | 2016-10-03 15:42:44 +0000 | https://twitter.com/dog_rates/status/782969140... | 12 | 10 | Cali | None | None | None | None |
# Number of tweet_id values that repeat an earlier row.
twitter_archive.tweet_id.duplicated().sum()
0
# Rows whose text contains both 'RT' and '@' — a text-based check, presumably to spot retweets.
twitter_archive[twitter_archive.text.str.contains('RT') & twitter_archive.text.str.contains('@')]
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19 | 888202515573088257 | NaN | NaN | 2017-07-21 01:02:36 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Canela. She attempted s... | 8.874740e+17 | 4.196984e+09 | 2017-07-19 00:47:34 +0000 | https://twitter.com/dog_rates/status/887473957... | 13 | 10 | Canela | None | None | None | None |
| 32 | 886054160059072513 | NaN | NaN | 2017-07-15 02:45:48 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @Athletics: 12/10 #BATP https://t.co/WxwJmv... | 8.860537e+17 | 1.960740e+07 | 2017-07-15 02:44:07 +0000 | https://twitter.com/dog_rates/status/886053434... | 12 | 10 | None | None | None | None | None |
| 36 | 885311592912609280 | NaN | NaN | 2017-07-13 01:35:06 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Lilly. She just paralle... | 8.305833e+17 | 4.196984e+09 | 2017-02-12 01:04:29 +0000 | https://twitter.com/dog_rates/status/830583320... | 13 | 10 | Lilly | None | None | None | None |
| 68 | 879130579576475649 | NaN | NaN | 2017-06-26 00:13:58 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Emmy. She was adopted t... | 8.780576e+17 | 4.196984e+09 | 2017-06-23 01:10:23 +0000 | https://twitter.com/dog_rates/status/878057613... | 14 | 10 | Emmy | None | None | None | None |
| 73 | 878404777348136964 | NaN | NaN | 2017-06-24 00:09:53 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: Meet Shadow. In an attempt to r... | 8.782815e+17 | 4.196984e+09 | 2017-06-23 16:00:04 +0000 | https://www.gofundme.com/3yd6y1c,https://twitt... | 13 | 10 | Shadow | None | None | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1023 | 746521445350707200 | NaN | NaN | 2016-06-25 01:52:36 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Shaggy. He knows exactl... | 6.678667e+17 | 4.196984e+09 | 2015-11-21 00:46:50 +0000 | https://twitter.com/dog_rates/status/667866724... | 10 | 10 | Shaggy | None | None | None | None |
| 1043 | 743835915802583040 | NaN | NaN | 2016-06-17 16:01:16 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: Extremely intelligent dog here.... | 6.671383e+17 | 4.196984e+09 | 2015-11-19 00:32:12 +0000 | https://twitter.com/dog_rates/status/667138269... | 10 | 10 | None | None | None | None | None |
| 1242 | 711998809858043904 | NaN | NaN | 2016-03-21 19:31:59 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @twitter: @dog_rates Awesome Tweet! 12/10. ... | 7.119983e+17 | 7.832140e+05 | 2016-03-21 19:29:52 +0000 | https://twitter.com/twitter/status/71199827977... | 12 | 10 | None | None | None | None | None |
| 2259 | 667550904950915073 | NaN | NaN | 2015-11-20 03:51:52 +0000 | <a href="http://twitter.com" rel="nofollow">Tw... | RT @dogratingrating: Exceptional talent. Origi... | 6.675487e+17 | 4.296832e+09 | 2015-11-20 03:43:06 +0000 | https://twitter.com/dogratingrating/status/667... | 12 | 10 | None | None | None | None | None |
| 2260 | 667550882905632768 | NaN | NaN | 2015-11-20 03:51:47 +0000 | <a href="http://twitter.com" rel="nofollow">Tw... | RT @dogratingrating: Unoriginal idea. Blatant ... | 6.675484e+17 | 4.296832e+09 | 2015-11-20 03:41:59 +0000 | https://twitter.com/dogratingrating/status/667... | 5 | 10 | None | None | None | None | None |
181 rows × 17 columns
# Column dtypes and non-null counts of the image predictions.
image_predict.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2075 entries, 0 to 2074 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2075 non-null int64 1 jpg_url 2075 non-null object 2 img_num 2075 non-null int64 3 p1 2075 non-null object 4 p1_conf 2075 non-null float64 5 p1_dog 2075 non-null bool 6 p2 2075 non-null object 7 p2_conf 2075 non-null float64 8 p2_dog 2075 non-null bool 9 p3 2075 non-null object 10 p3_conf 2075 non-null float64 11 p3_dog 2075 non-null bool dtypes: bool(3), float64(3), int64(2), object(4) memory usage: 152.1+ KB
# Five randomly sampled image-prediction rows.
image_predict.sample(5)
| tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1841 | 838083903487373313 | https://pbs.twimg.com/media/C6F42cGUYAAIKsX.jpg | 2 | chow | 0.800975 | True | seat_belt | 0.164133 | False | Pomeranian | 0.017981 | True |
| 613 | 680145970311643136 | https://pbs.twimg.com/media/CXBdJxLUsAAWql2.jpg | 1 | miniature_poodle | 0.457117 | True | toy_poodle | 0.226481 | True | Maltese_dog | 0.067682 | True |
| 15 | 666099513787052032 | https://pbs.twimg.com/media/CT51-JJUEAA6hV8.jpg | 1 | Lhasa | 0.582330 | True | Shih-Tzu | 0.166192 | True | Dandie_Dinmont | 0.089688 | True |
| 2000 | 876120275196170240 | https://pbs.twimg.com/media/DCiavj_UwAAcXep.jpg | 1 | Bernese_mountain_dog | 0.534327 | True | Saint_Bernard | 0.346312 | True | Greater_Swiss_Mountain_dog | 0.094933 | True |
| 1252 | 747963614829678593 | https://pbs.twimg.com/media/CmFM7ngXEAEitfh.jpg | 1 | kelpie | 0.307672 | True | Irish_terrier | 0.197486 | True | dingo | 0.105475 | False |
# Number of tweet_id values that repeat an earlier row.
image_predict.tweet_id.duplicated().sum()
0
# Summary statistics of the image predictions' numeric columns.
image_predict.describe()
| tweet_id | img_num | p1_conf | p2_conf | p3_conf | |
|---|---|---|---|---|---|
| count | 2.075000e+03 | 2075.000000 | 2075.000000 | 2.075000e+03 | 2.075000e+03 |
| mean | 7.384514e+17 | 1.203855 | 0.594548 | 1.345886e-01 | 6.032417e-02 |
| std | 6.785203e+16 | 0.561875 | 0.271174 | 1.006657e-01 | 5.090593e-02 |
| min | 6.660209e+17 | 1.000000 | 0.044333 | 1.011300e-08 | 1.740170e-10 |
| 25% | 6.764835e+17 | 1.000000 | 0.364412 | 5.388625e-02 | 1.622240e-02 |
| 50% | 7.119988e+17 | 1.000000 | 0.588230 | 1.181810e-01 | 4.944380e-02 |
| 75% | 7.932034e+17 | 1.000000 | 0.843855 | 1.955655e-01 | 9.180755e-02 |
| max | 8.924206e+17 | 4.000000 | 1.000000 | 4.880140e-01 | 2.734190e-01 |
# Number of jpg_url values that repeat an earlier row.
image_predict.jpg_url.duplicated().sum()
66
# Last five rows of the retweet/favorite counts.
df_tweet.tail()
| tweet_id | retweet_count | favorite_count | |
|---|---|---|---|
| 2349 | 666049248165822465 | 41 | 111 |
| 2350 | 666044226329800704 | 147 | 311 |
| 2351 | 666033412701032449 | 47 | 128 |
| 2352 | 666029285002620928 | 48 | 132 |
| 2353 | 666020888022790149 | 532 | 2535 |
# Column dtypes and non-null counts of the retweet/favorite counts.
df_tweet.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2354 entries, 0 to 2353 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2354 non-null int64 1 retweet_count 2354 non-null int64 2 favorite_count 2354 non-null int64 dtypes: int64(3) memory usage: 55.3 KB
# Number of tweet_id values that repeat an earlier row.
df_tweet.tweet_id.duplicated().sum()
0
ASSESSMENT SUMMARY
Quality
twitter_archive
image_predict
Tidiness
twitter_archive
CLEANING DATA
# Work on copies so the raw frames stay untouched for re-assessment.
df_image = image_predict.copy()
df_archive = twitter_archive.copy()
df_tweets = df_tweet.copy()

# Keep original tweets only: rows with a retweeted_status_id are retweets...
retweets = df_archive.retweeted_status_id.notnull()
df_archive = df_archive[~retweets]
# ...and rows with an in_reply_to_status_id are replies.
replies = df_archive.in_reply_to_status_id.notnull()
# .copy() detaches the filtered result so the column assignments below work on
# an independent DataFrame instead of a slice (avoids SettingWithCopyWarning).
df_archive = df_archive[~replies].copy()

# Drop the columns that are not used in the rest of the notebook.
df_archive = df_archive.drop(['in_reply_to_status_id', 'in_reply_to_user_id',
                              'retweeted_status_id', 'retweeted_status_user_id',
                              'retweeted_status_timestamp', 'expanded_urls',
                              'source', 'name'], axis=1)

# Convert timestamp to a datetime data type.
df_archive['timestamp'] = pd.to_datetime(df_archive.timestamp)

# Collapse numerator and denominator into a single rating column.
df_archive['rating'] = df_archive.rating_numerator / df_archive.rating_denominator
df_archive = df_archive.drop(['rating_numerator', 'rating_denominator'], axis=1)

# Merge the four dog-stage indicator columns into one 'dog_stage' column.
stage = ['doggo', 'pupper', 'floofer', 'puppo']
# The literal string "None" marks an absent stage; blank it so the join skips it.
for dog in stage:
    df_archive[dog] = df_archive[dog].replace("None", '')
# Concatenate in the fixed order doggo, floofer, pupper, puppo.
# NOTE: rows tagged with several stages yield run-together labels such as
# 'doggopupper'; no separator is inserted.
df_archive['dog_stage'] = df_archive[['doggo', 'floofer', 'pupper', 'puppo']].agg(''.join, axis=1)
# Drop the four old columns.
df_archive = df_archive.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1)
# Rows without any stage become NaN.
df_archive['dog_stage'] = df_archive['dog_stage'].replace('', np.nan)
# Column dtypes of the cleaned archive.
df_archive.dtypes
tweet_id int64 timestamp datetime64[ns, UTC] text object rating float64 dog_stage object dtype: object
# Column dtypes and non-null counts of the cleaned archive.
df_archive.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2097 entries, 0 to 2355 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2097 non-null int64 1 timestamp 2097 non-null datetime64[ns, UTC] 2 text 2097 non-null object 3 rating 2097 non-null float64 4 dog_stage 336 non-null object dtypes: datetime64[ns, UTC](1), float64(1), int64(1), object(2) memory usage: 98.3+ KB
# Five randomly sampled rows of the cleaned archive.
df_archive.sample(5)
| tweet_id | timestamp | text | rating | dog_stage | |
|---|---|---|---|---|---|
| 1575 | 687476254459715584 | 2016-01-14 03:28:06+00:00 | This is Curtis. He's a fluffball. 11/10 would ... | 1.1 | pupper |
| 1157 | 724405726123311104 | 2016-04-25 01:12:38+00:00 | This is Ashleigh. She's having Coachella withd... | 1.0 | pupper |
| 2049 | 671504605491109889 | 2015-12-01 01:42:28+00:00 | This is Charlie. He was just informed that dog... | 1.1 | NaN |
| 707 | 785515384317313025 | 2016-10-10 16:20:36+00:00 | Today, 10/10, should be National Dog Rates Day | 1.0 | NaN |
| 1571 | 687704180304273409 | 2016-01-14 18:33:48+00:00 | Say hello to Blakely. He thinks that's a hat. ... | 0.9 | pupper |
# Re-run the 'RT' + '@' text filter on the cleaned archive.
df_archive[df_archive.text.str.contains('RT') & df_archive.text.str.contains('@')]
| tweet_id | timestamp | text | rating | dog_stage |
|---|
# Frequency of each dog_stage label (NaN rows are not counted by value_counts).
df_archive.dog_stage.value_counts()
pupper 221 doggo 72 puppo 23 floofer 9 doggopupper 9 doggopuppo 1 doggofloofer 1 Name: dog_stage, dtype: int64
# Drop rows whose image URL repeats an earlier row (keep the first occurrence).
df_image = df_image.drop_duplicates(subset='jpg_url', keep='first').copy()

# For each image, take the first of p1, p2, p3 that is flagged as a dog.
# np.select picks the choice of the first condition that is True and falls
# back to NaN when none of the three predictions is a dog.
condition_list = [df_image['p1_dog'],
                  df_image['p2_dog'],
                  df_image['p3_dog']]

# Collapse the three prediction-name columns into one.
selection_list = [df_image['p1'],
                  df_image['p2'],
                  df_image['p3']]
df_image['dog_type'] = np.select(condition_list, selection_list, default=np.nan)

# Collapse the three confidence columns into one, using the same conditions.
selection_list = [df_image['p1_conf'],
                  df_image['p2_conf'],
                  df_image['p3_conf']]
df_image['confidence'] = np.select(condition_list, selection_list, default=np.nan)

# Normalise the dog type names to lower case.
df_image['dog_type'] = df_image.dog_type.str.lower()
df_image = df_image.drop(['p1', 'p1_dog', 'p1_conf',
                          'p2', 'p2_dog', 'p2_conf',
                          'p3', 'p3_dog', 'p3_conf'], axis=1)

# Join the engagement counts with the image predictions. The working copy
# df_tweets is used here, consistent with df_image and df_archive.
df_clean = pd.merge(df_tweets, df_image, on='tweet_id', how='inner')
# [since text and time from twitter archived will be used for my analysis,]
# perform an inner join to get rid of rows of tweets without [text and timestamp]
df_final = pd.merge(df_clean, df_archive, on='tweet_id', how='inner')

# Reorder the columns in the final dataset before storing. .copy() makes the
# result an independent frame, so the later assignment to its 'text' column
# does not raise SettingWithCopyWarning.
df_final_clean = df_final[['tweet_id', 'timestamp', 'text', 'jpg_url', 'img_num',
                           'dog_type', 'dog_stage', 'rating', 'confidence',
                           'retweet_count', 'favorite_count']].copy()
# Confirm that no repeated jpg_url values remain after cleaning.
df_image.jpg_url.duplicated().sum()
0
# Frequency of each dog_type in the cleaned image predictions.
df_image.dog_type.value_counts()
golden_retriever 158
labrador_retriever 108
pembroke 95
chihuahua 91
pug 63
...
scotch_terrier 1
entlebucher 1
japanese_spaniel 1
standard_schnauzer 1
bouvier_des_flandres 1
Name: dog_type, Length: 113, dtype: int64
# Five randomly sampled rows of the counts/predictions merge.
df_clean.sample(5)
| tweet_id | retweet_count | favorite_count | jpg_url | img_num | dog_type | confidence | |
|---|---|---|---|---|---|---|---|
| 985 | 710272297844797440 | 1425 | 4945 | https://pbs.twimg.com/media/Cdtk414WoAIUG0v.jpg | 1 | old_english_sheepdog | 0.586307 |
| 677 | 757725642876129280 | 1391 | 5022 | https://pbs.twimg.com/media/CoP7c4bWcAAr55g.jpg | 2 | labrador_retriever | 0.128128 |
| 401 | 809220051211603969 | 6554 | 22246 | https://pbs.twimg.com/media/CzrtWDbWEAAmIhy.jpg | 1 | pomeranian | 0.819511 |
| 718 | 751251247299190784 | 6695 | 13791 | https://pbs.twimg.com/ext_tw_video_thumb/75125... | 1 | walker_hound | 0.178852 |
| 1840 | 668986018524233728 | 183 | 578 | https://pbs.twimg.com/media/CUi3PIrWoAAPvPT.jpg | 1 | chihuahua | 0.005640 |
# Column dtypes and non-null counts of the counts/predictions merge.
df_clean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2008 entries, 0 to 2007 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2008 non-null int64 1 retweet_count 2008 non-null int64 2 favorite_count 2008 non-null int64 3 jpg_url 2008 non-null object 4 img_num 2008 non-null int64 5 dog_type 1691 non-null object 6 confidence 1691 non-null float64 dtypes: float64(1), int64(4), object(2) memory usage: 125.5+ KB
# Five randomly sampled rows of the tweet-counts copy.
df_tweets.sample(5)
| tweet_id | retweet_count | favorite_count | |
|---|---|---|---|
| 1138 | 727685679342333952 | 720 | 3206 |
| 331 | 832769181346996225 | 43 | 0 |
| 1228 | 713411074226274305 | 1440 | 4802 |
| 2109 | 670442337873600512 | 213 | 690 |
| 1184 | 718540630683709445 | 1137 | 2730 |
# Column dtypes and non-null counts of the tweet-counts copy.
df_tweets.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2354 entries, 0 to 2353 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2354 non-null int64 1 retweet_count 2354 non-null int64 2 favorite_count 2354 non-null int64 dtypes: int64(3) memory usage: 55.3 KB
# Column dtypes and non-null counts of the final merged dataset.
df_final_clean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1971 entries, 0 to 1970 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 1971 non-null int64 1 timestamp 1971 non-null datetime64[ns, UTC] 2 text 1971 non-null object 3 jpg_url 1971 non-null object 4 img_num 1971 non-null int64 5 dog_type 1666 non-null object 6 dog_stage 303 non-null object 7 rating 1971 non-null float64 8 confidence 1666 non-null float64 9 retweet_count 1971 non-null int64 10 favorite_count 1971 non-null int64 dtypes: datetime64[ns, UTC](1), float64(2), int64(4), object(4) memory usage: 184.8+ KB
# Last five rows of the final merged dataset.
df_final_clean.tail()
| tweet_id | timestamp | text | jpg_url | img_num | dog_type | dog_stage | rating | confidence | retweet_count | favorite_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1966 | 666049248165822465 | 2015-11-16 00:24:50+00:00 | Here we have a 1949 1st generation vulpix. Enj... | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | miniature_pinscher | NaN | 0.5 | 0.560311 | 41 | 111 |
| 1967 | 666044226329800704 | 2015-11-16 00:04:52+00:00 | This is a purebred Piers Morgan. Loves to Netf... | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | rhodesian_ridgeback | NaN | 0.6 | 0.408143 | 147 | 311 |
| 1968 | 666033412701032449 | 2015-11-15 23:21:54+00:00 | Here is a very happy pup. Big fan of well-main... | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | german_shepherd | NaN | 0.9 | 0.596461 | 47 | 128 |
| 1969 | 666029285002620928 | 2015-11-15 23:05:30+00:00 | This is a western brown Mitsubishi terrier. Up... | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | redbone | NaN | 0.7 | 0.506826 | 48 | 132 |
| 1970 | 666020888022790149 | 2015-11-15 22:32:08+00:00 | Here we have a Japanese Irish Setter. Lost eye... | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | welsh_springer_spaniel | NaN | 0.8 | 0.465074 | 532 | 2535 |
SAVING THE FINAL DATASET TO A CSV FILE
# Write the final dataset to disk; index=False leaves the row index out of the file.
df_final_clean.to_csv('twitter_archive_master.csv', index=False)
DATA ANALYSIS AND VISUALIZATIONS
Is there a correlation among Rating, Retweet Count and Favorite Count?
# Pairwise correlation between rating and the two engagement counts.
corr_matrix = df_final_clean[['rating','retweet_count','favorite_count']].corr()
# Draw the matrix as an annotated heatmap on a 15x10 figure.
fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(corr_matrix, annot=True, ax=ax);
Which dog types were predicted most often? Top 3
# The three most frequent dog types in the final dataset.
df_final_clean['dog_type'].value_counts().head(3)
golden_retriever 156 labrador_retriever 106 pembroke 94 Name: dog_type, dtype: int64
# Getting an image of the most liked golden_retriever picture
# Golden retriever tweets, most favorited first.
golden_retriever = df_final_clean.query('dog_type == "golden_retriever"').sort_values(by='favorite_count', ascending=False)
# URL of the most favorited golden retriever picture.
url = golden_retriever.jpg_url.iloc[0]
# A timeout keeps the cell from hanging on a stalled connection.
response = requests.get(url, timeout=30)
# Stop on an HTTP error rather than handing an error page to PIL.
response.raise_for_status()
Image.open(BytesIO(response.content))
An image of a Golden Retriever with the most Favorite Count
# Rank every labrador retriever tweet from most to least liked.
# (Stored under its own name instead of reusing `golden_retriever`, which
# would mislabel the data and overwrite the golden retriever ranking.)
labrador_retriever = df_final_clean.query('dog_type == "labrador_retriever"').sort_values(by='favorite_count', ascending=False)
# URL of the picture attached to the most liked labrador retriever tweet.
url = labrador_retriever.jpg_url.iloc[0]
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of handing an error page to PIL.
response = requests.get(url, timeout=30)
response.raise_for_status()
Image.open(BytesIO(response.content))
An Image of the Labrador Retriever with the most Favorite Count
# Rank every pembroke tweet from most to least liked.
# (Stored under its own name instead of reusing `golden_retriever`, which
# would mislabel the data and overwrite the golden retriever ranking.)
pembroke = df_final_clean.query('dog_type == "pembroke"').sort_values(by='favorite_count', ascending=False)
# URL of the picture attached to the most liked pembroke tweet.
url = pembroke.jpg_url.iloc[0]
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of handing an error page to PIL.
response = requests.get(url, timeout=30)
response.raise_for_status()
Image.open(BytesIO(response.content))
An image of a Pembroke with the most Favorite Count
What do our raters say in their tweets?
# Creating a function to clean hyperlinks from the text column in df_final_clean.
# The pattern matches an http:// or https:// link up to the next whitespace
# character; it is compiled once because the function runs on every row.
_URL_PATTERN = re.compile(r'https?://\S+')


def cleanTxt(text):
    """Remove every hyperlink from a tweet's text.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (for example ``None`` or the NaN
        that pandas uses for a missing entry) are returned unchanged instead
        of raising ``TypeError``.

    Returns
    -------
    str
        ``text`` with all ``http://`` and ``https://`` links removed. The
        whitespace surrounding a removed link is left as-is.
    """
    if not isinstance(text, str):
        return text
    return _URL_PATTERN.sub('', text)
# Strip the hyperlinks from every tweet in place.
df_final_clean['text'] = df_final_clean.text.apply(cleanTxt)
# Concatenate all tweets into one space-separated string for the word cloud.
# str.join consumes the column directly; no intermediate list is needed.
text = ' '.join(df_final_clean.text)
# Build a word cloud (at most 100 words, black background) from the joined tweets.
# NOTE(review): `stopwords` is not defined in this cell; presumably it is set
# earlier in the notebook (e.g. from wordcloud.STOPWORDS) — confirm.
wordcloud = WordCloud(
    width=1000,
    height=600,
    max_words=100,
    stopwords=stopwords,
    background_color="black",
)
wordcloud.generate(text)

# Show the rendered cloud on a black figure without axes.
plt.figure(figsize=(20, 10), facecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off");
Wordcloud of the content of tweets
Which dog type and image received the highest rating?
# Highest rating value present in the cleaned dataset.
df_final_clean.rating.max()
177.6
# Show the tweet(s) holding the top rating. The maximum is looked up from the
# data instead of being hard-coded, so the cell stays correct if the dataset changes.
df_final_clean[df_final_clean.rating == df_final_clean.rating.max()]
| tweet_id | timestamp | text | jpg_url | img_num | dog_type | dog_stage | rating | confidence | retweet_count | favorite_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 722 | 749981277374128128 | 2016-07-04 15:00:45+00:00 | This is Atticus. He's quite simply America af.... | https://pbs.twimg.com/media/CmgBZ7kWcAAlzFD.jpg | 1 | NaN | NaN | 177.6 | NaN | 2772 | 5569 |
# Getting the picture of the highest-rated dog. The top rating is derived from
# the data rather than hard-coded, so the lookup cannot go stale.
top_rated = df_final_clean[df_final_clean.rating == df_final_clean.rating.max()]
url = top_rated.jpg_url.iloc[0]
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of handing an error page to PIL.
response = requests.get(url, timeout=30)
response.raise_for_status()
Image.open(BytesIO(response.content))
The highest-rated dog (dog type unknown because the prediction was incorrect)
Which dog type image received the highest favorite count?
# Highest favorite count present in the cleaned dataset.
df_final_clean.favorite_count.max()
132810
# Show the tweet(s) with the highest favorite count. The maximum is looked up
# from the data instead of being hard-coded, so the cell stays correct if the
# dataset changes.
df_final_clean[df_final_clean.favorite_count == df_final_clean.favorite_count.max()]
| tweet_id | timestamp | text | jpg_url | img_num | dog_type | dog_stage | rating | confidence | retweet_count | favorite_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 306 | 822872901745569793 | 2017-01-21 18:26:02+00:00 | Here's a super supportive puppo participating ... | https://pbs.twimg.com/media/C2tugXLXgAArJO4.jpg | 1 | lakeland_terrier | puppo | 1.3 | 0.196015 | 48265 | 132810 |
# Getting the picture of the dog with the highest favorite count. The maximum
# is derived from the data rather than hard-coded, so the lookup cannot go stale.
most_favorited = df_final_clean[df_final_clean.favorite_count == df_final_clean.favorite_count.max()]
url = most_favorited.jpg_url.iloc[0]
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of handing an error page to PIL.
response = requests.get(url, timeout=30)
response.raise_for_status()
Image.open(BytesIO(response.content))
An image of a Lakeland Terrier [Puppo]
Which dog type image received the highest retweet count?
# Highest retweet count present in the cleaned dataset.
df_final_clean.retweet_count.max()
79515
# Show the tweet(s) with the highest retweet count. The maximum is looked up
# from the data instead of being hard-coded, so the cell stays correct if the
# dataset changes.
df_final_clean[df_final_clean.retweet_count == df_final_clean.retweet_count.max()]
| tweet_id | timestamp | text | jpg_url | img_num | dog_type | dog_stage | rating | confidence | retweet_count | favorite_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 769 | 744234799360020481 | 2016-06-18 18:26:18+00:00 | Here's a doggo realizing you can stand in a po... | https://pbs.twimg.com/ext_tw_video_thumb/74423... | 1 | labrador_retriever | doggo | 1.3 | 0.825333 | 79515 | 131075 |
# Getting the picture of the dog with the highest retweet count. The maximum
# is derived from the data rather than hard-coded, so the lookup cannot go stale.
most_retweeted = df_final_clean[df_final_clean.retweet_count == df_final_clean.retweet_count.max()]
url = most_retweeted.jpg_url.iloc[0]
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of handing an error page to PIL.
response = requests.get(url, timeout=30)
response.raise_for_status()
Image.open(BytesIO(response.content))
An image of a Labrador Retriever [Doggo]
Importing a Dashboard created with Microsoft PowerBI for the cleaned dataset
# Embed the published Power BI report for the cleaned dataset in a
# 1000x600 inline frame and render it in the notebook.
Dashboard = IFrame(
    src="https://app.powerbi.com/view?r=eyJrIjoiM2NmMGI2YjItOWU1YS00OWZjLWIxYTAtNGUyNWVjYzNmNDZmIiwidCI6ImE5NjMwYTViLTA5M2EtNDM5Yy04NjM5LThhYmJmMzRhN2M5NyJ9",
    width=1000,
    height=600,
)
display(Dashboard)
Display of a dashboard with the master data using Microsoft Power BI